#############################################################################
# project: Prioritizing Exceptional Social Needs 
# file:    01_preparation_function.R 
# authors: Michael Jankowski, Brian Dietrich
# task:    data preparation of csv data for conjoint analysis
# input:   data_rh.csv, data_uhh.csv, data_hsvn.csv, data_civic.csv
# last     revision: 2023/07/3
# output:  prep_data function, four_surveys data
############################################################################


### install packages
#install.packages("tidyverse")
#install.packages("data.table")
#install.packages("cregg")
#install.packages("Hmisc")
#install.packages("here")
#install.packages("ebal")
#install.packages("cobalt")

###  load packages
library(tidyverse)
library(data.table)
library(cregg)         
library(Hmisc)
library(here)

### show the project root detected by `here` (this call does not change the working directory)
here()


### Paths to the four raw sample data sets, named by sample identifier
four_surveys <- setNames(
  c(
    "../data/conjoint_data/raw_data/data_rh.csv",
    "../data/conjoint_data/raw_data/data_uhh.csv",
    "../data/conjoint_data/raw_data/data_hsvn.csv",
    "../data/conjoint_data/raw_data/data_civic.csv"
  ),
  c("rh", "uhh", "hsvn", "civic")
)


### function for data preparation ###

#' Prepare one raw survey export so it can be analysed with the cregg package
#'
#' Steps performed:
#'   1. Read the raw csv and drop preview responses, responses whose
#'      `Progress` is not 100, and respondents who answered "Nein" to
#'      `cj0_daten`.
#'   2. Recode the eight conjoint choices ("Person A" -> "1",
#'      "Person B" -> "2"); tasks 7 and 8 are taken from columns `Q87`/`Q90`.
#'   3. Write the cleaned csv to `../data/conjoint_data/prep_data/` and read
#'      it back with `cjoint::read.qualtrics()`.
#'   4. Translate attribute names and levels from German to English and turn
#'      the attributes into factors with a fixed level order.
#'
#' @param x Path to a raw survey csv. The sample label is derived from the
#'   file name: everything up to the last underscore and the trailing ".csv"
#'   are removed (e.g. ".../data_rh.csv" -> "rh").
#' @return The data frame returned by `cjoint::read.qualtrics()` with English
#'   attribute names/levels, a `Sample` label column and a `respondentIndex`
#'   column; the `Response.ID` and `respondent` columns are dropped.
prep_data <- function(x) {

  # Sample label taken from the file name; the dot is escaped and the suffix
  # anchored so only a literal trailing ".csv" is stripped.
  sam <- gsub(".*_|\\.csv$", "", x)

  # Location of the intermediate csv that is written and then re-read below.
  prep_file <- paste0("../data/conjoint_data/prep_data/prep_csv_", sam, ".csv")

  df <- fread(x, data.table = FALSE)

  df <- df[!grepl("preview", df$DistributionChannel), ] # exclude all test cases

  # Exclude incomplete participants. Rows whose Progress is not numeric are
  # kept on purpose (presumably the extra Qualtrics header rows that
  # read.qualtrics(new.format = TRUE) expects -- TODO confirm). The coercion
  # is done once; its "NAs introduced by coercion" warning is expected.
  progress <- suppressWarnings(as.numeric(df$Progress))
  df <- df[progress == 100 | is.na(progress), ]

  df <- df[df$cj0_daten != "Nein", ] # exclude cases who did not accept the policy

  # Map the answer text to the profile number; other values pass through.
  replace_answer <- function(answer) {
    ifelse(grepl("Person A", answer), "1",
           ifelse(grepl("Person B", answer), "2", answer))
  }

  # Choices 1-6 are recoded in place; choices 7 and 8 are stored in the raw
  # export under Q87 and Q90.
  choice_cols <- paste0("cj", 1:8, "_choice")
  source_cols <- c(choice_cols[1:6], "Q87", "Q90")
  for (k in seq_along(choice_cols)) {
    df[[choice_cols[k]]] <- replace_answer(df[[source_cols[k]]])
  }

  # Data fix: an "x" recorded for task 4 is treated as a choice of profile 1.
  df$cj4_choice[df$cj4_choice == "x"] <- "1"

  # Make sure the output directory exists so write.table() cannot fail on a
  # fresh checkout.
  dir.create(dirname(prep_file), recursive = TRUE, showWarnings = FALSE)

  write.table(df,
              file = prep_file,
              row.names = FALSE,
              sep = ",")

  dfcj <- cjoint::read.qualtrics(prep_file,
                                 responses = choice_cols,
                                 respondentID = "ResponseId",
                                 new.format = TRUE)

  # Repair encoding of the column names
  names(dfcj) <- rvest::repair_encoding(paste(names(dfcj)), from = "utf8")

  # Rename attributes (German -> English)
  dfcj <- dfcj %>%
    rename("Age" = "Alter",
           "Citizenship" = "Staatsangehörigkeit",
           "Education" = "Bildungsgrad",
           "Since" = "Dauer.des.ALG.II.Bezuges",
           "Gender" = "Geschlecht",
           "Composition" = "Zusammensetzung.der.Bedarfsgemeinschaft",
           "Unemployment" = "Grund.für.Arbeitslosigkeit",
           "Supportive" = "Grad.der.Mitwirkung.(zum.Beispiel.Pünktlichkeit.oder.Vollständigkeit.der.Unterlagen)")

  # Sample label with the number of respondents. Assumes 16 rows per
  # respondent (presumably 8 tasks x 2 profiles -- TODO confirm).
  dfcj$Sample <- paste0(sam, "\n(N = ", nrow(dfcj) / 16, ")")

  # Repair encoding of the attribute levels; factor columns become character
  # vectors here and are turned back into factors further down.
  dfcj <- modify_if(dfcj,
                    is.factor,
                    function(col) rvest::repair_encoding(paste(col), from = "utf8"))

  # Translate attribute levels ("Vietnam" is spelled the same in both
  # languages and therefore needs no recoding).
  dfcj$Gender[dfcj$Gender == "Männlich"] <- "Male"
  dfcj$Gender[dfcj$Gender == "Weiblich"] <- "Female"
  dfcj$Age <- gsub("Jahre", "Years", dfcj$Age)
  dfcj$Education[dfcj$Education == "Universität"] <- "University"
  dfcj$Education[dfcj$Education == "Ungelernt"] <- "Unskilled"
  dfcj$Education[dfcj$Education == "Fachhochschule"] <- "Technical College"
  dfcj$Education[dfcj$Education == "Berufsausbildung"] <- "Vocational Training"
  dfcj$Citizenship[dfcj$Citizenship == "Türkei"] <- "Turkey"
  dfcj$Citizenship[dfcj$Citizenship == "Syrien"] <- "Syria"
  dfcj$Citizenship[dfcj$Citizenship == "Rumänien"] <- "Romania"
  dfcj$Citizenship[dfcj$Citizenship == "Frankreich"] <- "France"
  dfcj$Citizenship[dfcj$Citizenship == "Deutschland"] <- "Germany"
  dfcj$Composition[dfcj$Composition == "2 Erwachsene, 3 Kinder"] <- "2 Adults, 3 Childr."
  dfcj$Composition[dfcj$Composition == "2 Erwachsene, 1 Kind"] <- "2 Adults, 1 Child"
  dfcj$Composition[dfcj$Composition == "2 Erwachsene"] <- "2 Adults"
  dfcj$Composition[dfcj$Composition == "1 Erwachsene/r, 2 Kinder"] <- "1 Adult, 2 Childr."
  dfcj$Composition[dfcj$Composition == "1 Erwachsene/r"] <- "1 Adult"
  dfcj$Since[grepl("0,5 Jahre", dfcj$Since)] <- "6 Months"
  dfcj$Since[grepl("3 Jahre", dfcj$Since)] <- "3 Years"
  dfcj$Since[grepl("18 Monate", dfcj$Since)] <- "1.5 Years"
  dfcj$Since[grepl("12 Monate", dfcj$Since)] <- "1 Year"
  dfcj$Unemployment[grepl("Fehlverhalten", dfcj$Unemployment)] <- "Fired for wrongdoing"
  dfcj$Unemployment[grepl("Unfalls", dfcj$Unemployment)] <- "Disabled due to accident"
  dfcj$Unemployment[grepl("Insolvenz", dfcj$Unemployment)] <- "Bankruptcy of employer"
  dfcj$Unemployment[grepl("Freiwillige", dfcj$Unemployment)] <- "Voluntarily dismissal"
  dfcj$Supportive[grepl("Mittlere", dfcj$Supportive)] <- "Medium"
  dfcj$Supportive[grepl("Hohe", dfcj$Supportive)] <- "High"
  dfcj$Supportive[grepl("Geringe", dfcj$Supportive)] <- "Low"

  # Fix the level order of each attribute; the first level of each factor
  # is the reference category.
  dfcj$Gender <- factor(dfcj$Gender, levels = c("Male", "Female"))
  dfcj$Age <- factor(dfcj$Age, levels = paste(c(23, 36, 48, 57), "Years"))
  dfcj$Education <- factor(dfcj$Education,
                           levels = c("Unskilled", "Vocational Training",
                                      "Technical College", "University"))
  dfcj$Citizenship <- factor(dfcj$Citizenship,
                             levels = c("Vietnam", "Syria", "Romania", "Turkey",
                                        "France", "Germany"))
  dfcj$Composition <- factor(dfcj$Composition,
                             levels = c("1 Adult", "2 Adults",
                                        "2 Adults, 1 Child",
                                        "1 Adult, 2 Childr.",
                                        "2 Adults, 3 Childr."))
  dfcj$Since <- factor(dfcj$Since,
                       levels = c("6 Months", "1 Year", "1.5 Years", "3 Years"))
  dfcj$Unemployment <- factor(dfcj$Unemployment,
                              levels = c("Fired for wrongdoing",
                                         "Voluntarily dismissal",
                                         "Bankruptcy of employer",
                                         "Disabled due to accident"))
  dfcj$Supportive <- factor(dfcj$Supportive, levels = c("Low", "Medium", "High"))

  # Keep the Qualtrics response id as the respondent identifier.
  dfcj$respondentIndex <- dfcj$Response.ID

  dfcj <- dfcj %>% # drop the original id and respondent columns
    select(-Response.ID, -respondent)

  # NOTE(review): earlier code built "<attribute>_rowpos" names for the
  # rowpos columns at this point but never assigned them to names(dfcj), so
  # the returned column names were unaffected. That unused code was removed
  # to keep the output identical; add an explicit names(dfcj) <- ... if the
  # renaming is actually wanted.

  dfcj
}

################# End ##################